1. Data preparation

Read data

SE object created from exploratory analysis.

Gene selection according to Biotype already performed

SE_Bio <- readRDS(paste0(params$SE_Bio))

Convert ENSG annotation to Gene Symbol before Differential Expression Analysis and duplicate gene names removal

Duplicated gene names are dropped and gene IDs are set as rownames.

The number of duplicated GeneName is: 11899

The number of duplicated Ensembl Genes with version is: 0

SE_Bio <- SE_Bio[!duplicated(rowData(SE_Bio)$GeneName), ]
rownames(SE_Bio) <- rowData(SE_Bio)$GeneName
SE_Bio
## class: SummarizedExperiment 
## dim: 24708 36 
## metadata(0):
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(36): OLE_001 OLE_002 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size
ScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')

padj_threshold = params$padj_threshold
log2fc_threshold = params$log2fc_threshold

Filtering Thresholds are set to:

  • Log2FC = 1.5
  • FDR = 0.05

Sample selection

Only day25 Cortical Brain Organoids are kept

SE_Bio_d25CbO <- SE_Bio[, colData(SE_Bio)$Timepoint %in% 'd25']
colData(SE_Bio_d25CbO)
## DataFrame with 13 rows and 11 columns
##         SampleNumber SampleID_GU SampleID_OG             SampleName        SampleType    Genotype
##            <integer> <character> <character>            <character>       <character> <character>
## OLE_007            5    D25_2_WT    D25_2_WT ORG_CHD2_WT_round2A_.. CorticalOrganoids          WT
## OLE_008            6    D25_2_HT    D25_2_HT ORG_CHD2_HT_round2A_.. CorticalOrganoids          HT
## OLE_013            9   D25_3b_WT   D25_3b_WT ORG_CHD2_WT_round3_d.. CorticalOrganoids          WT
## OLE_014           10   D25_3b_HT   D25_3b_HT ORG_CHD2_HT_round3_d.. CorticalOrganoids          HT
## OLE_031           18     OLE_031        OL06         6_Evo1_d25_WT3 CorticalOrganoids          WT
## ...              ...         ...         ...                    ...               ...         ...
## OLE_035           22     OLE_035        OL10    10_Evo3_d25_WTC_1-2 CorticalOrganoids          WT
## OLE_036           23     OLE_036        OL11     11_Evo3_d25_A5_1-2 CorticalOrganoids          AR
## OLE_037           24     OLE_037        OL12      12_Evo4_d25_WTC-A CorticalOrganoids          WT
## OLE_038           25     OLE_038        OL13       13_Evo4_d25_A5-A CorticalOrganoids          AR
## OLE_039           26     OLE_039        OL14      14_Evo4_d25_HET-A CorticalOrganoids          HT
##           Timepoint       Batch    SeqRun        HRID  lib.size
##         <character> <character> <integer> <character> <numeric>
## OLE_007         d25      Round2         1     OLE_007  17223455
## OLE_008         d25      Round2         1     OLE_008  18879578
## OLE_013         d25      Round3         1     OLE_013  20419528
## OLE_014         d25      Round3         1     OLE_014  41252167
## OLE_031         d25        Evo1         2     OLE_031  24061399
## ...             ...         ...       ...         ...       ...
## OLE_035         d25        Evo3         2     OLE_035  18643755
## OLE_036         d25        Evo3         2     OLE_036  21633783
## OLE_037         d25        Evo4         2     OLE_037  22432181
## OLE_038         d25        Evo4         2     OLE_038  20928321
## OLE_039         d25        Evo4         2     OLE_039  19103731

DESeq2

2. Generation of the dds object

  • DDS object is generated from the Count Matrix and Sample Metadata stored in the SE_Bio object
  • Genotype is specified for the design: Ancestral, Wildtype.
CountMatrix <- assays(SE_Bio_d25CbO)$counts
SampleMeta <- DataFrame(colData(SE_Bio_d25CbO))

all(rownames(SampleMeta) == colnames(CountMatrix))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData=assays(SE_Bio_d25CbO)$counts, DataFrame(colData(SE_Bio_d25CbO)), design = ~Batch+Genotype)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are
## characters, converting to factors

mcols(dds) <- DataFrame(mcols(dds), DataFrame(rowData(SE_Bio_d25CbO)))

dds$Genotype <- factor(dds$Genotype, levels = c("WT", "AR", 'HT')) #no need to specify as column is already ordered, but safer
dds$Genotype <- relevel(dds$Genotype, ref = "WT")


dds$Genotype
##  [1] WT HT WT HT WT AR HT HT WT AR WT AR HT
## Levels: WT AR HT

dds
## class: DESeqDataSet 
## dim: 24708 13 
## metadata(1): version
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(13): OLE_007 OLE_008 ... OLE_038 OLE_039
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size

Inspection of genes with zero counts

ZeroPlot <- CountMatrix %>% 
  mutate(row = row_number()) %>%
  pivot_longer(cols = -row, names_to = "col", values_to = "value") %>% 
  filter(value == 0) %>%
  group_by(col) %>%
  summarise(zerocount = n())  %>%
  ggplot(., aes(y=col, x=zerocount)) +
  geom_col(col='black', fill='#76c8c8') +
  coord_flip() + 
  geom_label(aes(label=zerocount)) + 
  labs(title=paste0('Number of genes with zero counts ', '(out of ', nrow(CountMatrix), ' genes)'),
       y='', x='') +
  theme_bw() +
  theme(axis.text = element_text(colour = 'black', size=7),
        axis.text.x = element_text(angle=45, hjust = 0.5, vjust=0.5),
        plot.title = element_text(hjust = 0.5, size = 7))

ZeroPlot

Filtering of the dds object

  • I focus on protein-coding and lncRNA genes only (other biotypes were already removed from the SE_Bio object)
  • I implement a filter on the expression of at least 5 reads in at least 2 samples
keep <- rowSums(counts(dds)>=5) >= 2 #changed from 5 ##changed from 10 and 2 samples from 3

table(keep)
## keep
## FALSE  TRUE 
##  6927 17781
dds <- dds[keep,]

dds
## class: DESeqDataSet 
## dim: 17781 13 
## metadata(1): version
## assays(1): counts
## rownames(17781): TSPAN6 TNMD ... PDCD6-AHRR LNCDAT
## rowData names(9): Gene EnsGene ... Start End
## colnames(13): OLE_007 OLE_008 ... OLE_038 OLE_039
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size

A dds object containing information about 17781 genes in 13 samples is tested for differential expression.


3. Differential Expression

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sizeFactors(dds)
##   OLE_007   OLE_008   OLE_013   OLE_014   OLE_031   OLE_032   OLE_033   OLE_034   OLE_035 
## 0.7693623 0.8724444 0.9194546 1.8788253 1.1027455 1.0629631 0.9634343 1.0544527 0.8574659 
##   OLE_036   OLE_037   OLE_038   OLE_039 
## 0.9944899 1.0366067 0.9750425 0.8666412

Inspection of the dds object

Top50 genes heatmap

select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:50]
ntd <- normTransform(dds)
df <- as.data.frame(colData(dds)[,c("Genotype")])
colnames(df) <- 'Genotype'
#df$Run <- c(rep('rep1', 3), rep('rep2', 3))

rownames(df) <- colnames(ntd)

pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=TRUE, annotation_col=df, border_color = 'black')

Samples distances

vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
#rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
#colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors, main = 'Samples Distances', name = 'vst')

Counts Distribution

SF <- data.frame(Sample=names(sizeFactors(dds)), SizeF=sizeFactors(dds))

par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds)),  col=ScaledCols, cex.axis=0.7, 
        las=1, xlab="log2(counts)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds, normalized=TRUE)),  col=ScaledCols, cex.axis=0.7, 
        las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts") 
plotDensity(log2(counts(dds)),  col=ScaledCols, 
            xlab="log2(counts)", cex.lab=0.7, panel.first=grid()) 
plotDensity(log2(counts(dds, normalized=TRUE)), col=ScaledCols, 
            xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) 

Dispersion estimate

plotDispEsts(dds)


Extract Result of contrast: AR vs WT

res_dds_ar <- results(dds, contrast=c("Genotype","AR","WT"), alpha=0.05, cooksCutoff=0.99)
summary(res_dds_ar)
## 
## out of 17781 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 81, 0.46%
## LFC < 0 (down)     : 90, 0.51%
## outliers [1]       : 0, 0%
## low counts [2]     : 1379, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

metadata(res_dds_ar)$filterThreshold
## 7.755102% 
##  5.155949
metadata(res_dds_ar)$alpha
## [1] 0.05

plot(metadata(res_dds_ar)$filterNumRej, 
     type="b", ylab="number of rejections",
     xlab="quantiles of filter", main='Ancestral',
     cex.lab=0.5, cex.axis=0.5, cex.main=0.5)
lines(metadata(res_dds_ar)$lo.fit, col="red")
abline(v=metadata(res_dds_ar)$filterTheta)

Top results from res table sorted by adjusted Pvalue for AR vs WT

head(res_dds_ar[order(res_dds_ar$padj),])
## log2 fold change (MLE): Genotype AR vs WT 
## Wald test p-value: Genotype AR vs WT 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## PCDHA2      172.146       -3.63859  0.335444 -10.84706 2.05951e-27 2.81403e-23
## LINC01515   195.870       -5.03618  0.466301 -10.80029 3.43132e-27 2.81403e-23
## SLC15A4     124.413       -4.02533  0.397896 -10.11655 4.66586e-24 2.55098e-20
## FZD10      1452.125       -5.87179  0.620686  -9.46017 3.07443e-21 1.26067e-17
## ZNF100      556.842       -1.45890  0.156480  -9.32320 1.12888e-20 3.70317e-17
## UBE2T       701.970       -1.01053  0.113299  -8.91919 4.69711e-19 1.28403e-15

  • Genes modulated considering a FDR threshold of 0.1: 268

  • Genes modulated considering a FDR threshold of 0.05: 171

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 111

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 74


Fold-change shrinkage

Since I am using the constrast option to retrieve the results, I cannot rely on apleglm default algorithm for logFC shrinkage. Since at the moment I am not using the lfcThreshold option, I decide for the ashr algorithm.

resAshr_ar <- lfcShrink(dds, contrast=c("Genotype","AR","WT"), res=res_dds_ar, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
summary(resAshr_ar)
## 
## out of 17781 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 81, 0.46%
## LFC < 0 (down)     : 90, 0.51%
## outliers [1]       : 0, 0%
## low counts [2]     : 1379, 7.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Ancestral

#par(mfrow=c(1,2), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-6,6)
DESeq2::plotMA(res_dds_ar, xlim=xlim, ylim=ylim, main="no LFC shrink")

DESeq2::plotMA(resAshr_ar, xlim=xlim, ylim=ylim, main="LFC shrink with ashr algorithm")


Top results from res table sorted by adjpvalue after LFC shrinkage

head(resAshr_ar[order(resAshr_ar$padj),])
## log2 fold change (MMSE): Genotype AR vs WT 
## Wald test p-value: Genotype AR vs WT 
## DataFrame with 6 rows and 5 columns
##            baseMean log2FoldChange     lfcSE      pvalue        padj
##           <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## PCDHA2      172.146      -3.602244  0.334754 2.05951e-27 2.81403e-23
## LINC01515   195.870      -4.945350  0.462591 3.43132e-27 2.81403e-23
## SLC15A4     124.413      -3.970674  0.395832 4.66586e-24 2.55098e-20
## FZD10      1452.125      -5.691165  0.613107 3.07443e-21 1.26067e-17
## ZNF100      556.842      -1.395251  0.157343 1.12888e-20 3.70317e-17
## UBE2T       701.970      -0.980601  0.112123 4.69711e-19 1.28403e-15

  • Genes modulated considering a FDR threshold of 0.1: 268

  • Genes modulated considering a FDR threshold of 0.05: 171

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 80

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 40


Log Fold change shrinkage doesn’t change the results significantly, thus I decided to keep the standard DESeq2 workflow for testing differential gene expression.


Extract DEGs

  • Here I extract the DEGs passing adjusted Pvalue and logFC thresholds, for AR vs WT contrast (deseqDEGsAR)
deseqDEGsAR <- dplyr::filter(data.frame(res_dds_ar), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))

deseqDEGsARashr <- dplyr::filter(data.frame(resAshr_ar), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))

Results Visualization

SE_deseq2 <- as(dds, "RangedSummarizedExperiment")
assays(SE_deseq2)$vst <- assay(vst(dds, blind=TRUE))
FdrCeil = 1e-10
logFcCeil = 7.5

Plotting Ceilings are set to:

  • Log2FC 7.5
  • FDR 10^{-10}

Volcano

Ancestral

#rename(as.data.frame(res_dds_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'AR vs WT')

dplyr::rename(as.data.frame(res_dds_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'AR vs WT (day25 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)

Ancestral LFC shrink

#rename(as.data.frame(res_dds_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'AR vs WT')

dplyr::rename(as.data.frame(resAshr_ar), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'AR vs WT (day25 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)


9. Savings

DEAList <- list()

DEAList <- list(dds = dds,  #same for both
                AR = list(res = res_dds_ar, 
                          DEGs = deseqDEGsAR,
                          resAshr =resAshr_ar,
                          DEGsAshr = deseqDEGsARashr))
# RDS
saveRDS(DEAList, paste0(SavingFolder, '/day25CbO.', 'DEAList_AR.rds')) 

saveRDS(SE_deseq2, paste0(SavingFolder, '/day25CbO.', 'SE_deseq2_AR.rds')) 

saveRDS(res_dds_ar, paste0(SavingFolder, '/day25CbO.', 'deseqARvsWT.rds')) 

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(SavingFolder, '/day25CbO.', 'DEAnalysisWorkspace_AR.RData'))

last update on: Fri Jan 10 18:52:43 2025